/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.io;
import java.io.*;
import java.util.Arrays;
import net.nutch.util.*;
/** A file-based map from keys to values.
*
* <p>A map is a directory containing two files, the <code>data</code> file,
* containing all keys and values in the map, and a smaller <code>index</code>
* file, containing a fraction of the keys. The fraction is determined by
* {@link Writer#getIndexInterval()}.
*
* <p>The index file is read entirely into memory. Thus key implementations
* should try to keep themselves small.
*
* <p>Map files are created by adding entries in-order. To maintain a large
* database, perform updates by copying the previous version of a database and
* merging in a sorted change list, to create a new version of the database in
* a new file. Sorting large change lists can be done with {@link
* SequenceFile.Sorter}.
*/
public class MapFile {
/** The name of the index file. */
public static final String INDEX_FILE_NAME = "index";
/** The name of the data file. */
public static final String DATA_FILE_NAME = "data";
protected MapFile() {} // no public ctor
/**
 * Writes a new map.
 *
 * <p>Entries are appended in strictly increasing key order. Every entry goes
 * to the <code>data</code> file; one entry out of every
 * {@link #getIndexInterval()} is also recorded in the <code>index</code>
 * file, together with the data-file length at the time it was appended.
 */
public static class Writer {
  private SequenceFile.Writer data;
  private SequenceFile.Writer index;

  private int indexInterval = 128;

  // number of entries appended so far
  private long size;
  // reused holder for the data-file offset written with each index entry
  private LongWritable position = new LongWritable();

  // the following fields are used only for checking key order
  private WritableComparator comparator;
  private DataInputBuffer inBuf = new DataInputBuffer();
  private DataOutputBuffer outBuf = new DataOutputBuffer();
  private WritableComparable lastKey;

  /**
   * Create the named map for keys of the named class.
   *
   * @param dirName directory to create for the map; must not exist yet
   * @param keyClass class of the keys
   * @param valClass class of the values
   * @throws IOException if the directory exists, cannot be created, or the
   *         underlying files cannot be opened
   */
  public Writer(String dirName, Class keyClass, Class valClass)
    throws IOException {
    this(dirName, new WritableComparator(keyClass), valClass);
  }

  /**
   * Create the named map using the named key comparator.
   *
   * @param dirName directory to create for the map; must not exist yet
   * @param comparator comparator used to check key order; also supplies the
   *        key class
   * @param valClass class of the values
   * @throws IOException if the directory exists, cannot be created, or the
   *         underlying files cannot be opened
   */
  public Writer(String dirName, WritableComparator comparator,
                Class valClass) throws IOException {
    this.comparator = comparator;
    this.lastKey = comparator.newKey();

    File dir = new File(dirName);
    if (dir.exists())
      throw new IOException("already exists: " + dir);
    // fail here with a clear message instead of later when opening the files
    if (!dir.mkdir())
      throw new IOException("could not create: " + dir);

    File dataFile = new File(dir, DATA_FILE_NAME);
    File indexFile = new File(dir, INDEX_FILE_NAME);
    Class keyClass = comparator.getKeyClass();

    this.data =
      new SequenceFile.Writer(dataFile.getPath(), keyClass, valClass);
    boolean indexOpened = false;
    try {
      this.index =
        new SequenceFile.Writer(indexFile.getPath(),
                                keyClass, LongWritable.class);
      indexOpened = true;
    } finally {
      if (!indexOpened) {                     // don't leak the data writer
        try {
          data.close();
        } catch (IOException ignored) {
          // the failure that brought us here is the one worth reporting
        }
      }
    }
  }

  /** The number of entries that are added before an index entry is added.*/
  public int getIndexInterval() { return indexInterval; }

  /**
   * Sets the index interval.
   *
   * @param interval number of entries per index entry; must be positive,
   *        since a zero interval would make {@link #append} divide by zero
   * @throws IllegalArgumentException if <code>interval</code> is not positive
   * @see #getIndexInterval()
   */
  public void setIndexInterval(int interval) {
    if (interval <= 0)
      throw new IllegalArgumentException(
        "index interval must be positive: " + interval);
    indexInterval = interval;
  }

  /**
   * Close the map. The index file is closed even if closing the data file
   * fails.
   */
  public synchronized void close() throws IOException {
    try {
      data.close();
    } finally {
      index.close();
    }
  }

  /**
   * Append a key/value pair to the map. The key must be strictly greater
   * than the previous key added to the map.
   *
   * @throws IOException if the key is out of order or a write fails
   */
  public synchronized void append(WritableComparable key, Writable val)
    throws IOException {

    checkKey(key);

    if (size % indexInterval == 0) {          // add an index entry
      position.set(data.getLength());         // point to current eof
      index.append(key, position);
    }

    data.append(key, val);                    // append key/value to data
    size++;
  }

  /**
   * Verifies that <code>key</code> sorts strictly after the last appended
   * key, then remembers a copy of it for the next check.
   */
  private void checkKey(WritableComparable key) throws IOException {
    // check that keys are well-ordered
    if (size != 0 && comparator.compare(lastKey, key) >= 0)
      throw new IOException("key out of order: "+key+" after "+lastKey);

    // update lastKey with a copy of key by writing and reading, so that the
    // caller is free to reuse its key instance
    outBuf.reset();
    key.write(outBuf);                        // write new key

    inBuf.reset(outBuf.getData(), outBuf.getLength());
    lastKey.readFields(inBuf);                // read into lastKey
  }
}
/**
 * Provide access to an existing map.
 *
 * <p>The index file is loaded into memory on the first call to
 * {@link #seek(WritableComparable)} and closed immediately afterwards; the
 * data file stays open until {@link #close()}.
 */
public static class Reader {
  private WritableComparator comparator;

  // serialized form of the key currently being sought
  private DataOutputBuffer keyBuf = new DataOutputBuffer();
  // serialized form of the last key read from the data file while seeking
  private DataOutputBuffer nextBuf = new DataOutputBuffer();
  private int nextKeyLen = -1;

  // data-file position and index slot left behind by the previous seek
  private long seekPosition = -1;
  private int seekIndex = -1;

  // data-file position of the first entry
  private long firstPosition;

  // scratch key used by get() so the caller's key is not overwritten
  private WritableComparable getKey;

  // the data, on disk
  private SequenceFile.Reader data;
  private SequenceFile.Reader index;

  // whether the index Reader was closed
  private boolean indexClosed = false;

  // the index, in memory
  private int count = -1;
  private WritableComparable[] keys;
  private long[] positions;

  /** Returns the class of keys in this file. */
  public Class getKeyClass() { return data.getKeyClass(); }

  /** Returns the class of values in this file. */
  public Class getValueClass() { return data.getValueClass(); }

  /** Construct a map reader for the named map.*/
  public Reader(String dirName) throws IOException {
    this(dirName, null);
  }

  /**
   * Construct a map reader for the named map using the named comparator.
   *
   * @param dirName directory holding the map's data and index files
   * @param comparator key comparator, or <code>null</code> to build a
   *        default one from the key class recorded in the data file
   * @throws IOException if either file cannot be opened
   */
  public Reader(String dirName, WritableComparator comparator)
    throws IOException {
    File dir = new File(dirName);
    File dataFile = new File(dir, DATA_FILE_NAME);
    File indexFile = new File(dir, INDEX_FILE_NAME);

    // open the data
    this.data = new SequenceFile.Reader(dataFile.getPath());
    boolean opened = false;
    try {
      this.firstPosition = data.getPosition();

      if (comparator == null)
        this.comparator = new WritableComparator(data.getKeyClass());
      else
        this.comparator = comparator;

      this.getKey = this.comparator.newKey();

      // open the index
      this.index = new SequenceFile.Reader(indexFile.getPath());
      opened = true;
    } finally {
      if (!opened) {                          // don't leak the data reader
        try {
          data.close();
        } catch (IOException ignored) {
          // the failure that brought us here is the one worth reporting
        }
      }
    }
  }

  /**
   * Loads the index file into <code>keys</code>/<code>positions</code> if
   * that has not been attempted yet, then closes the index file.
   *
   * <p>An <code>EOFException</code> while reading is logged and the entries
   * read so far are kept.
   *
   * NOTE(review): any other failure (e.g. "key out of order") propagates,
   * but <code>keys</code> is already non-null at that point, so later calls
   * return early and work with the partial index.
   */
  private void readIndex() throws IOException {
    // read the index entirely into memory
    if (this.keys != null)
      return;
    this.count = 0;
    this.keys = new WritableComparable[1024];
    this.positions = new long[1024];
    try {
      LongWritable position = new LongWritable();
      WritableComparable lastKey = null;
      while (true) {
        WritableComparable k = comparator.newKey();

        if (!index.next(k, position))
          break;

        // check order to make sure comparator is compatible
        if (lastKey != null && comparator.compare(lastKey, k) >= 0)
          throw new IOException("key out of order: "+k+" after "+lastKey);
        lastKey = k;

        if (count == keys.length) {           // time to grow arrays
          int newLength = (keys.length*3)/2;
          WritableComparable[] newKeys = new WritableComparable[newLength];
          long[] newPositions = new long[newLength];
          System.arraycopy(keys, 0, newKeys, 0, count);
          System.arraycopy(positions, 0, newPositions, 0, count);
          keys = newKeys;
          positions = newPositions;
        }

        keys[count] = k;
        positions[count] = position.get();
        count++;
      }
    } catch (EOFException e) {
      SequenceFile.LOG.warning("Unexpected EOF reading " + index +
                               " at entry #" + count + ".  Ignoring.");
    } finally {
      indexClosed = true;
      index.close();
    }
  }

  /** Re-positions the reader before its first key. */
  public synchronized void reset() throws IOException {
    data.seek(firstPosition);
  }

  /**
   * Positions the reader at the named key, or if none such exists, at the
   * first entry after the named key.  Returns true iff the named key exists
   * in this map.
   *
   * <p>When the key falls between the key the previous seek stopped at and
   * the next indexed key, the scan resumes from the previous position
   * instead of consulting the in-memory index again.
   */
  public synchronized boolean seek(WritableComparable key)
    throws IOException {
    readIndex();                              // make sure index is read

    keyBuf.reset();                           // write key to keyBuf
    key.write(keyBuf);

    if (seekIndex != -1                       // seeked before
        && seekIndex+1 < count
        && comparator.compare(key,keys[seekIndex+1])<0 // before next indexed
        && comparator.compare(keyBuf.getData(), 0, keyBuf.getLength(),
                              nextBuf.getData(), 0, nextKeyLen)
        >= 0) {                               // but after last seeked
      // do nothing: seekPosition from the previous seek is still a valid
      // starting point for the scan below
    } else {
      seekIndex = binarySearch(key);
      if (seekIndex < 0)                      // decode insertion point
        seekIndex = -seekIndex-2;

      if (seekIndex == -1)                    // belongs before first entry
        seekPosition = firstPosition;         // use beginning of file
      else
        seekPosition = positions[seekIndex];  // else use index
    }
    data.seek(seekPosition);

    // scan forward entry by entry until we reach or pass the desired key
    while ((nextKeyLen = data.next(nextBuf.reset())) != -1) {
      int c = comparator.compare(keyBuf.getData(), 0, keyBuf.getLength(),
                                 nextBuf.getData(), 0, nextKeyLen);
      if (c <= 0) {                           // at or beyond desired
        data.seek(seekPosition);              // back off to previous
        return c == 0;
      }
      seekPosition = data.getPosition();
    }

    return false;                             // ran off the end of the data
  }

  /**
   * Binary search over the in-memory index keys.
   *
   * @return the slot of <code>key</code> if it is an indexed key, otherwise
   *         <code>-(insertionPoint + 1)</code>
   */
  private int binarySearch(WritableComparable key) {
    int low = 0;
    int high = count-1;

    while (low <= high) {
      int mid = (low + high) >>> 1;           // unsigned shift: no overflow
      WritableComparable midVal = keys[mid];
      int cmp = comparator.compare(midVal, key);

      if (cmp < 0)
        low = mid + 1;
      else if (cmp > 0)
        high = mid - 1;
      else
        return mid;                           // key found
    }
    return -(low + 1);                        // key not found.
  }

  /** Read the next key/value pair in the map into <code>key</code> and
   * <code>val</code>.  Returns true if such a pair exists and false when at
   * the end of the map */
  public synchronized boolean next(WritableComparable key, Writable val)
    throws IOException {
    return data.next(key, val);
  }

  /**
   * Return the value for the named key, or null if none exists.
   *
   * @param key key to look up; left unmodified
   * @param val instance the value is read into
   * @return <code>val</code> if the key exists, else <code>null</code>
   */
  public synchronized Writable get(WritableComparable key, Writable val)
    throws IOException {
    if (seek(key)) {
      next(getKey, val);                      // don't smash key
      return val;
    } else
      return null;
  }

  /**
   * Close the map. The data file is closed even if closing the index file
   * fails, and the index file is not closed a second time.
   */
  public synchronized void close() throws IOException {
    try {
      if (! indexClosed) {
        indexClosed = true;
        index.close();
      }
    } finally {
      data.close();
    }
  }
}
/**
 * Renames an existing map directory.
 *
 * @param oldName current path of the map directory
 * @param newName path the directory should be moved to
 * @throws IOException if the rename does not succeed
 */
public static void rename(String oldName, String newName)
  throws IOException {
  File source = new File(oldName);
  File target = new File(newName);

  boolean renamed = source.renameTo(target);
  if (renamed)
    return;

  throw new IOException("Could not rename " + source + " to " + target);
}
/**
 * Deletes the named map file: its data file, its index file, and then the
 * directory that contained them.
 *
 * <p>The results of the individual deletions are not checked, so a file or
 * directory that could not be removed is not reported.
 *
 * @param name path of the map directory
 */
public static void delete(String name) throws IOException {
  File mapDir = new File(name);

  // remove the contents first, then the directory itself
  new File(mapDir, DATA_FILE_NAME).delete();
  new File(mapDir, INDEX_FILE_NAME).delete();
  mapDir.delete();
}
/**
 * Copies every entry of one map into a newly created map.
 *
 * <p>Usage: <code>MapFile inFile outFile</code>. The key and value classes
 * of the output are taken from the input map. Both maps are closed before
 * returning, including when the copy fails part-way.
 */
public static void main(String[] args) throws Exception {
  String usage = "Usage: MapFile inFile outFile";

  if (args.length != 2) {
    System.err.println(usage);
    System.exit(-1);
  }

  String in = args[0];
  String out = args[1];

  MapFile.Reader reader = new MapFile.Reader(in);
  try {
    MapFile.Writer writer =
      new MapFile.Writer(out, reader.getKeyClass(), reader.getValueClass());
    try {
      // one key and one value instance are reused for every entry
      WritableComparable key =
        (WritableComparable)reader.getKeyClass().newInstance();
      Writable value = (Writable)reader.getValueClass().newInstance();

      while (reader.next(key, value))         // copy all entries
        writer.append(key, value);
    } finally {
      writer.close();                         // flush/close even on failure
    }
  } finally {
    reader.close();                           // was never closed before
  }
}
}